#!/usr/bin/env bash
# Converts the math-8k JSON dataset into preprocessed instruction-tuning data
# by invoking ./preprocess_data.py with the Qwen2.5-7B-Instruct tokenizer.
#
# Usage: run from the directory that contains preprocess_data.py.

# Adjust the set_env.sh path below to match your actual environment.
source set_env.sh

# Directory that receives the preprocessed output; created if missing.
# NOTE(review): the value ends with '/', and it is passed verbatim as
# --output-prefix, so generated file names presumably start right after the
# directory separator -- confirm this is the intended naming.
MG_DATASET_PATH=/home/ma-user/work/dataset/mg_data/math-8k-instruct-7B/
mkdir -p "$MG_DATASET_PATH"

# --map-keys maps the handler's fields onto this dataset's JSON keys:
# "prompt" is read from "prompt", "query" is left empty, and "response" is
# read from "answer".
python ./preprocess_data.py \
	--input /home/ma-user/work/users/qzh/codes/MindSpeed-LLM-hxj/math-8k/math.8k.json \
	--tokenizer-name-or-path /home/ma-user/work/models/hf_models/Qwen/Qwen2.5-7B-Instruct/ \
	--output-prefix "$MG_DATASET_PATH" \
	--handler-name AlpacaStyleInstructionHandler \
	--tokenizer-type PretrainedFromHF \
	--workers 16 \
	--log-interval 1000 \
	--prompt-type qwen_r1 \
	--map-keys '{"prompt":"prompt","query":"","response":"answer"}' # dataset-specific mapping; required for this dataset
  # Alternative mapping, kept for reference (presumably the tool's default, in
  # which case it may be omitted -- verify against preprocess_data.py):
  # --map-keys '{"prompt":"instruction","query":"input","response":"output"}'